Code
library(tidyverse)
library(janitor)
library(ggplot2)
library(dplyr)
library(rugarch)
library(gganimate)
# Data split: train 80% -> model training 75% + weight calibration 5%
# (weights taken from t-1); test 20%.
library(tidyverse)
library(janitor)
library(ggplot2)
library(dplyr)
library(rugarch)
library(gganimate)

# Load the order-book snapshots of one stock and derive per-row features.
stock <- read.csv("data/individual_book_train/stock_100.csv")

# WAP:          level-1 bid/ask prices weighted by the opposite side's size.
# BidAskSpread: relative spread between the level-1 ask and bid prices.
# imbalance:    absolute relative difference between level-1 bid and ask sizes.
stock <- stock %>%
  mutate(
    WAP = (bid_price1 * ask_size1 + ask_price1 * bid_size1) /
      (bid_size1 + ask_size1),
    BidAskSpread = ask_price1 / bid_price1 - 1,
    imbalance = abs((bid_size1 - ask_size1) / (bid_size1 + ask_size1))
  )
# Per-time_id containers: second-level log returns of WAP, plus the mean
# imbalance and mean bid-ask spread of each window.
log_rs <- list()
imba_mean <- vector()
BAS_mean <- vector()
#time_IDs <- unique(stock$time_id)
time_IDs <- unique(stock[, 1])[1:5]
all_seconds <- 1:600
for (k in seq_along(time_IDs)) {
  # Subset the window once and read every column from it.
  snapshot <- stock %>% filter(time_id == time_IDs[k])
  secs <- snapshot$seconds_in_bucket
  wap <- snapshot$WAP
  imba_mean[[k]] <- mean(snapshot$imbalance)
  BAS_mean[[k]] <- mean(snapshot$BidAskSpread)

  # Log return between consecutive observations, stamped with the later time.
  returns <- data.frame(
    time = secs[-1],
    log_return = log(wap[-1] / wap[1:(length(wap) - 1)])
  )

  # Seconds with no observation get a zero return so every second in 1..600
  # is represented.
  missing_secs <- all_seconds[!(all_seconds %in% returns$time)]
  if (length(missing_secs) > 0) {
    returns <- rbind(returns, data.frame(time = missing_secs, log_return = 0))
    returns <- returns[order(returns$time), ]
  }
  log_rs[[k]] <- returns
}
# Per-time_id bucket volatilities, filled in later.
vol <- list()

# Realised volatility of a vector of log returns: the square root of the sum
# of squared returns.
comp_vol <- function(x) {
  squared <- x ^ 2
  sqrt(sum(squared))
}
# Assign every second to a 30-second bucket and compute the realised
# volatility of each bucket with comp_vol().
for (i in seq_along(log_rs)) {
  log_rs[[i]] <- log_rs[[i]] %>% mutate(time_bucket = ceiling(time / 30))
  vol[[i]] <- aggregate(log_return ~ time_bucket, data = log_rs[[i]],
                        FUN = comp_vol)
  colnames(vol[[i]]) <- c("time_bucket", "volatility")
}

# Cluster label of each time_id.
cluster_l <- vector()
# Label each time_id from its mean bid-ask spread and mean imbalance:
#   4: mean spread    > 0.15
#   3: mean imbalance > 0.61
#   2: mean imbalance < 0.45
#   1: everything else
for (i in seq_along(vol)) {
  if (BAS_mean[[i]] > 0.15) {
    # Assign to element i; a bare `cluster_l <- 4` would replace the whole
    # vector and discard the labels of earlier time_ids.
    cluster_l[[i]] <- 4
  } else if (imba_mean[[i]] > 0.61) {
    cluster_l[[i]] <- 3
  } else if (imba_mean[[i]] < 0.45) {
    cluster_l[[i]] <- 2
  } else {
    cluster_l[[i]] <- 1
  }
}

# Model
# eGARCH(1, 1) variance model, ARMA(0, 0) mean model, normal distribution.
variance_spec <- list(model = "eGARCH", garchOrder = c(1, 1))
mean_spec <- list(armaOrder = c(0, 0))
spec <- ugarchspec(
  variance.model = variance_spec,
  mean.model = mean_spec,
  distribution.model = "norm"
)
# One fitted model per time_id.
ARMA_GARCH.models <- list()
# Only seconds up to 450 (the first 75% of the window) are used for fitting.
for (i in seq_along(vol)) {
  train_returns <- log_rs[[i]] %>%
    filter(time <= 450) %>%
    pull(log_return)
  ARMA_GARCH.models[[i]] <- ugarchfit(
    spec = spec,
    data = train_returns,
    solver = "hybrid"
  )
}
# Simulate n_w + n_p seconds ahead from every fitted model. The first n_w
# simulated seconds give the forecast used for weight calibration
# (garch_weight); the remaining n_p seconds are split into 30-second buckets
# whose forecasts are stored in pred1_adjust.
n_w <- 30          # simulated seconds used for weight calibration
n_p <- 120         # simulated seconds that are forecast
bucket_len <- 30   # seconds per forecast bucket
n_bucket <- n_p / bucket_len
garch_weight <- vector()
pred1 <- list()
pred1_adjust <- list()
for (i in seq_along(vol)) {
  # Placeholder of ones, one row per forecast second.
  pred1[[i]] <- tibble(fitted = rep(1, n_p))

  # Fix the specification at the fitted coefficients before simulating.
  fspec <- getspec(ARMA_GARCH.models[[i]])
  setfixed(fspec) <- as.list(coef(ARMA_GARCH.models[[i]]))
  # NOTE(review): no seed is passed, so results presumably differ between
  # runs - confirm whether reproducibility is required.
  future.path <- fitted(ugarchpath(fspec, n.sim = n_w + n_p, m.sim = 1000))
  future.path[is.na(future.path)] <- 0

  # Mean over the simulated paths (columns) of the realised volatility
  # computed from the given rows (seconds).
  mean_path_vol <- function(rows) {
    mean(sqrt(colSums(future.path[rows, , drop = FALSE]^2)))
  }

  garch_weight[i] <- mean_path_vol(1:n_w)
  pred1_adjust[[i]] <- tibble(
    fitted = vapply(
      seq_len(n_bucket),
      function(k) {
        first_row <- n_w + (k - 1) * bucket_len + 1
        mean_path_vol(first_row:(first_row + bucket_len - 1))
      },
      numeric(1)
    )
  )
}

vol.train <- list()
vol.val <- list()
vol.w <- list()
# Rows 1-15 of every bucket-volatility table are used for training; the
# remaining rows are held out.
train_rows <- 1:15
for (i in seq_along(log_rs)) {
  buckets <- vol[[i]]
  vol.train[[i]] <- buckets[train_rows, ]
  vol.val[[i]] <- buckets[-train_rows, ]
}
# Training design matrices for the weighted regression: the volatility of
# bucket t is paired with the mean WAP, mean order size and mean spread of
# bucket t - 1.
list.reg <- list()
stocklm <- stock %>%
  mutate(
    time_bucket = ceiling(seconds_in_bucket / 30),
    num_order = bid_size1 + ask_size1 + bid_size2 + ask_size2
  )
len.train <- length(vol.train[[1]]$volatility)
for (i in seq_along(vol)) {
  # Bucket 0 is excluded before averaging.
  stats.bucket <- stocklm %>%
    filter(time_id == time_IDs[i], time_bucket != 0) %>%
    select(BidAskSpread, WAP, num_order, time_bucket)
  mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket,
                          FUN = mean)
  mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket,
                        FUN = mean)

  lag_rows <- 1:(len.train - 1)
  list.reg[[i]] <- data.frame(
    volatility = vol.train[[i]]$volatility[-1],
    price = mean.price$WAP[lag_rows],
    order = mean.order$num_order[lag_rows],
    BidAskSpread = mean.BAS$BidAskSpread[lag_rows]
  )
}
# Weighted linear regression per time_id. The weights grow geometrically
# towards the most recent row: 0.8^((len.train - 2) / 2) for the first row up
# to 0.8^0 = 1 for the last.
lm.models <- list()
for (i in seq_along(vol)) {
  lm.models[[i]] <- lm(volatility ~ price + order + BidAskSpread,
                       list.reg[[i]],
                       weights = 0.8 ^ (((len.train - 2):0) / 2))
}

# Validation design matrices, one per time_id.
list.reg.val <- list()
# Predict the held-out bucket volatilities with the weighted regressions.
# Each held-out bucket is paired with the features of the bucket before it,
# i.e. rows len.train .. len.train + len.val - 1 of the bucket means.
len.val <- length(vol.val[[1]]$volatility)
pred.lm <- list()
for (i in seq_along(vol)) {
  stats.bucket <- stocklm %>%
    filter(time_id == time_IDs[i] & time_bucket != 0) %>%
    select(c(BidAskSpread, WAP, num_order, time_bucket))
  mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket,
                          FUN = mean)
  mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket,
                        FUN = mean)

  val_rows <- len.train:(len.train + len.val - 1)
  list.reg.val[[i]] <- data.frame(
    volatility = vol.val[[i]]$volatility,
    price = mean.price$WAP[val_rows],
    order = mean.order$num_order[val_rows],
    BidAskSpread = mean.BAS$BidAskSpread[val_rows]
  )
  pred.lm[[i]] <- predict(lm.models[[i]], newdata = list.reg.val[[i]])
}

# These must be two statements: fused as `pred2 <- pred.lmlist.HAV <- list()`
# they form a chained assignment that empties pred2 and never creates
# list.HAV.
pred2 <- pred.lm

list.HAV <- list()
# HAV design matrix per time_id: the volatility of bucket t against the
# volatility of bucket t - 1 and the average of buckets t - 5 .. t - 1.
for (i in seq_along(vol)) {
  train_vol <- vol.train[[i]]$volatility

  # Sum the five lagged series (each already divided by 5), starting from a
  # zero vector and adding them in lag order.
  lagged_fifths <- lapply(1:5, function(lag) {
    train_vol[lag:(lag + len.train - 6)] / 5
  })
  mean.vol <- Reduce(`+`, lagged_fifths, rep(0, len.train - 5))

  list.HAV[[i]] <- data.frame(
    vol = train_vol[-(1:5)],
    vol_1 = train_vol[5:(len.train - 1)],
    mean_vol_5 = mean.vol
  )
}
# Per-time_id bucket quarticities, filled in later.
quar <- list()

# Quarticity of a vector of log returns: (n / 3) times the sum of fourth
# powers, where n is the number of returns.
comp_quar <- function(x) {
  n_obs <- length(x)
  fourth_powers <- x ^ 4
  n_obs / 3 * sum(fourth_powers)
}
# Quarticity of every time bucket, per time_id.
for (i in seq_along(log_rs)) {
  bucket_quar <- aggregate(log_return ~ time_bucket, data = log_rs[[i]],
                           FUN = comp_quar)
  names(bucket_quar) <- c("time_bucket", "quarticity")
  quar[[i]] <- bucket_quar
}
# Weighted least squares HAV model per time_id. Each row is weighted by its
# lagged volatility divided by the square root of the lagged bucket's
# quarticity.
HAV.wls.models <- list()
for (i in seq_along(vol)) {
  HAV.wls.models[[i]] <- lm(vol ~ vol_1 + mean_vol_5, list.HAV[[i]],
                            weights = list.HAV[[i]]$vol_1 /
                              sqrt(quar[[i]]$quarticity[5:(len.train - 1)]))
}

# Iterated HAV forecasts, one entry per run.
pred.hav.all <- list()
# Iterated HAV forecast of buckets 16-20: each predicted value is appended to
# a rolling window of five values and feeds the next step.
for (j in 1:1) {
  pred.hav <- list()
  latest_obs <- list()
  list_HAV1_cluster <- list()
  for (i in seq_along(vol)) {
    # Start from the last five training buckets (11-15).
    recent <- vol.train[[i]]$volatility[11:15]
    for (t in 1:5) {
      # Regressors for this step: newest value and mean of the window.
      hav_input <- data.frame(
        vol_1 = recent[5],
        mean_vol_5 = sum(recent) / 5
      )
      list_HAV1_cluster[[i]] <- hav_input
      step_pred <- unname(predict(HAV.wls.models[[i]], newdata = hav_input))
      pred.hav[[t]] <- step_pred
      # Slide the window: drop the oldest value, append the prediction.
      recent <- c(recent[-1], step_pred)
    }
    # After five steps the window holds exactly the five predictions.
    latest_obs[[i]] <- recent
    #cluster_pred_lm[[j]][[i]] <- latest_obs
  }
  pred.hav.all[[j]] <- latest_obs
}
#pred.hav.all[[1]][[1]][[1]]
# HAV forecasts per time_id, taken from the first (only) run.
pred3 <- list()
for (i in seq_along(vol)) {
  pred3[[i]] <- pred.hav.all[[1]][[i]]
}

# Model combination per cluster:
#   clusters 1, 3 = EGARCH + WLR
#   clusters 2, 4 = HAV + WLR
# Blend two forecasts per time_id for buckets 17-20.
#   mod_a: HAV forecast for clusters 2 and 4, EGARCH forecast otherwise.
#   mod_b: weighted linear regression (WLR) forecast.
# The weight applied at bucket t is the one that would have been best at
# bucket t - 1, so the bucket-17 weight is calibrated on bucket 16.

# Weight alpha on a 0, 0.1, ..., 1 grid that minimises
# |alpha * pred_a + (1 - alpha) * pred_b - target|; the smallest alpha wins
# ties.
best_alpha <- function(pred_a, pred_b, target) {
  alpha_grid <- (0:10) / 10
  abs_err <- abs(alpha_grid * pred_a + (1 - alpha_grid) * pred_b - target)
  alpha_grid[which.min(abs_err)]
}

mix <- list()
for (i in seq_along(vol)) {
  actual <- vol.val[[i]]$volatility  # buckets 16-20

  if (cluster_l[[i]] == 2 || cluster_l[[i]] == 4) {
    mod_a <- pred3[[i]][2:5]
    # Calibrate against the HAV forecast of bucket 16, the same model that
    # supplies mod_a (previously the GARCH forecast was used here).
    pa <- pred3[[i]][[1]]
  } else {
    mod_a <- pred1_adjust[[i]]$fitted
    pa <- garch_weight[[i]]
  }
  # unname() keeps predict()'s names from becoming data frame row names.
  mod_b <- unname(pred2[[i]][2:5])
  pb <- pred2[[i]][[1]]

  alpha_w <- numeric(4)
  # 16 -> 17
  alpha_w[1] <- best_alpha(pa, pb, actual[[1]])
  # 17-19 -> 18-20
  for (j in 1:3) {
    alpha_w[j + 1] <- best_alpha(mod_a[[j]], mod_b[[j]], actual[[j + 1]])
  }
  beta_w <- round(1 - alpha_w, digits = 1)

  mix[[i]] <- data.frame(
    time = c(17, 18, 19, 20),
    pred_f = mod_a * alpha_w + mod_b * beta_w,
    mod_a = mod_a,
    mod_b = mod_b,
    alpha_w = alpha_w,
    beta_w = beta_w,
    val = actual[2:5]
  )
}

# Value table
mix[[1]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0011024661 0.001102466 0.0012196350 1 0 0.0015946080
2 18 0.0009863879 0.001089846 0.0009863879 0 1 0.0007010344
3 19 0.0007984031 0.001092905 0.0007984031 0 1 0.0007203055
4 20 0.0011301855 0.001092094 0.0011301855 0 1 0.0008264389
[[2]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 3.861298e-05 0.0006596833 3.861298e-05 0.0 1.0 0.0006225340
2 18 6.203687e-04 0.0006611238 2.535723e-04 0.9 0.1 0.0003323998
3 19 3.359788e-04 0.0006642848 2.539023e-04 0.2 0.8 0.0001328152
4 20 1.260451e-04 0.0006634561 1.260451e-04 0.0 1.0 0.0001088957
[[3]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0011106308 0.0006334167 0.0011106308 0.0 1.0 0.0011320570
2 18 0.0009069263 0.0006068875 0.0009069263 0.0 1.0 0.0010788897
3 19 0.0010573743 0.0006246518 0.0010573743 0.0 1.0 0.0007937135
4 20 0.0007921832 0.0006227508 0.0010463319 0.6 0.4 0.0007345000
[[4]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0006207086 0.0002220253 0.0006207086 0.0 1.0 0.0004778847
2 18 0.0004352167 0.0002173968 0.0005804300 0.4 0.6 0.0005119734
3 19 0.0004923458 0.0002159725 0.0005614391 0.2 0.8 0.0007271090
4 20 0.0006149317 0.0002173656 0.0006149317 0.0 1.0 0.0001894792
[[5]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0004222089 0.0006701975 0.0004222089 0 1 0.0001584394
2 18 0.0003642260 0.0006730303 0.0003642260 0 1 0.0007691548
3 19 0.0006587597 0.0006587597 0.0004001583 1 0 0.0006672344
4 20 0.0006678949 0.0006678949 0.0003852155 1 0 0.0008105287
# Plot
# One plot per time_id: observed volatility, the two component models and
# the blended forecast, with the blending weights listed in the caption.
all_plot <- list()
for (i in seq_along(vol)) {
  blend <- mix[[i]]

  # " 17 = ( alpha : beta ) 18 = ( alpha : beta ) ..."
  weight_p <- paste0(
    " ",
    paste(blend$time, "=", "(", blend$alpha_w, ":", blend$beta_w, ")",
          collapse = " ")
  )

  model_label <- if (cluster_l[[i]] == 2 || cluster_l[[i]] == 4) {
    "HAV + WLR"
  } else {
    "EGARCH + WLR"
  }

  all_plot[[i]] <- ggplot(mix[[i]], aes(x = time)) +
    geom_line(aes(y = val, color = "Real Volatility")) +
    geom_line(aes(y = mod_a, color = "Model a"), linetype = "twodash") +
    geom_line(aes(y = mod_b, color = "Model b"), linetype = "twodash") +
    geom_line(aes(y = pred_f, color = "Mix Model"), linetype = "twodash") +
    scale_color_manual(name = "Model", values = c(
      "Real Volatility" = "red",
      "Model a" = "lightblue",
      "Model b" = "green",
      "Mix Model" = "blue")) +
    theme_classic() +
    labs(
      title = paste("Prediction Result\ncluster ",
                    as.character(cluster_l[[i]]),
                    model_label),
      tag = as.character(i),
      caption = paste("\n\n weight for each time interval : \n",
                      weight_p),
      subtitle = "each time interval = 30 seconds",
      x = "Time interval",
      y = "Volatility"
    )
}

all_plot[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
# Animate the first plot along the time axis; the other plots are left
# disabled.
all_plot[[1]] + transition_reveal(time)
#all_plot[[2]]+transition_reveal(time)
#all_plot[[3]]+transition_reveal(time)
#all_plot[[4]]+transition_reveal(time)
#all_plot[[5]]+transition_reveal(time)

# One data frame per time_id. This must be its own statement: fused onto the
# comment line above, it was commented out and test_df was never created.
test_df <- list()
# Rows for buckets 1-16 of each time_id, in the same column layout as `mix`:
# only `time` and `val` are filled, every model column stays NaN.
for (i in seq_along(vol)) {
  blank <- rep(NaN, 16)
  # Buckets 1-15 come from the training part; bucket 16 is the first
  # validation bucket.
  observed <- vol.train[[i]]$volatility[1:16]
  observed[16] <- vol.val[[i]]$volatility[1]
  test_df[[i]] <- data.frame(
    time = c(1:16),
    pred_f = blank,
    mod_a = blank,
    mod_b = blank,
    alpha_w = blank,
    beta_w = blank,
    val = observed
  )
}
# Stack all time_ids into one long table: for each time_id its 16 observed
# rows followed by its 4 blended-forecast rows.
agg_mix <- rbind(data.frame(test_df[[1]]), mix[[1]])
for (i in 2:length(mix)) {
  agg_mix <- rbind(agg_mix, test_df[[i]], mix[[i]])
}

# Replace the per-series bucket index with a running index over all rows.
n_rows <- nrow(agg_mix)
agg_mix$time <- c(1:n_rows)

# One column per cluster, NaN until filled.
for (cluster_col in c("c1", "c2", "c3", "c4")) {
  agg_mix[[cluster_col]] <- rep(NaN, n_rows)
}
# Copy each time_id's observed volatility into the column of its cluster so
# every cluster can be drawn in its own colour. Each time_id occupies 20
# consecutive rows of agg_mix (16 observed + 4 forecast).
rows_per_series <- 20
for (i in seq_along(vol)) {
  label <- cluster_l[[i]]
  # Labels outside 1-4 are skipped.
  if (label %in% 1:4) {
    rows <- ((i - 1) * rows_per_series + 1):(i * rows_per_series)
    agg_mix[[paste0("c", label)]][rows] <- agg_mix$val[rows]
  }
}

# Observed volatility in grey, overlaid by the per-cluster series and the
# blended prediction.
agg_plot <- ggplot(agg_mix, aes(x = time)) +
  geom_line(aes(y = val, color = "Volatility")) +
  geom_line(aes(y = c1, color = "Cluster 1")) +
  geom_line(aes(y = c2, color = "Cluster 2")) +
  geom_line(aes(y = c3, color = "Cluster 3")) +
  geom_line(aes(y = c4, color = "Cluster 4")) +
  geom_line(aes(y = pred_f, color = "Prediction"), linetype = "twodash") +
  scale_color_manual(name = "Cluster/Prediction", values = c(
    "Volatility" = "lightgrey",
    "Cluster 1" = "darkgreen",
    "Cluster 2" = "gold",
    "Cluster 3" = "red",
    "Cluster 4" = "blue",
    "Prediction" = "black")) +
  theme_classic() +
  labs(
    title = paste("Cluster/Prediction Result"),
    x = "Time interval",
    y = "Volatility",
    caption = "each time interval = 30 seconds"
  )
agg_plot + transition_reveal(time)